Deconvolution Layers and Autoencoders

林嶔 (Lin, Chin)

Lesson 10

Autoencoders (1)

– So far, however, what we have learned can be applied in only a limited range of scenarios. Let us start looking at where these techniques can actually be put to use.

F10_3

Autoencoders (2)

– Let us once again use the MNIST handwritten-digit data for this exercise. Please download the training data here, and the test data here:

library(data.table)

Train.DAT = fread("data/train_data.csv", data.table = FALSE)
Test.DAT = fread("data/test_data.csv", data.table = FALSE)

Train.X = t(Train.DAT[,-1])
dim(Train.X) = c(28, 28, 1, ncol(Train.X))
Train.X = Train.X/255
Train.Y = Train.DAT[,1]

Test.X = t(Test.DAT[,-1])
dim(Test.X) = c(28, 28, 1, ncol(Test.X))
Test.X = Test.X/255
Test.Y = Test.DAT[,1]

– A quick review of the file structure:

library(OpenImageR)

imageShow(t(Train.X[,,,1]))
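– And the label that goes with this first image (a quick check of our own, not in the original slides):

Train.Y[1]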

Autoencoders (3)

library(mxnet)

my_iterator_func <- setRefClass("Custom_Iter1",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]                        # drop the label row; keep the 784 pixel values
                                    dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                    val.x <- val.x/255                       # rescale pixels to [0, 1]
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- val.x                           # autoencoder: the target is the input itself
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter1 = my_iterator_func(iter = NULL,  data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)

– Let's look once more at how this Iterator is used:

my_iter1$reset()
my_iter1$iter.next()
## [1] TRUE
my_value = my_iter1$value()

library(OpenImageR)

imageShow(matrix(as.numeric(as.array(my_value$data)[,,,1]), nrow = 28, byrow = TRUE))

Autoencoders (4)

– Note in particular that, for our encoder to genuinely compress the data, the total number of values in each layer must be smaller than in the previous one! (A shape check follows the code below.)

# Encoder

data <- mx.symbol.Variable('data')

fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 128, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act_type = "relu", name = 'relu1')

encoder <- mx.symbol.FullyConnected(data = relu1, num.hidden = 32, name = 'encoder')

# Decoder

fc3 <- mx.symbol.FullyConnected(data = encoder, num.hidden = 128, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act_type = "relu", name = 'relu3')

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 784, name = 'fc4')

decoder <- mx.symbol.reshape(data = fc4, shape = c(28, 28, 1, -1), name = 'decoder')

# MSE loss

label <- mx.symbol.Variable(name = 'label')

residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder) 
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
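– As a quick sanity check (our own sketch, not part of the original slides), mx.symbol.infer.shape confirms that the encoder path shrinks each sample from 784 values to 128 and then to 32:

mx.symbol.infer.shape(fc1, data = c(28, 28, 1, 7))$out.shapes
mx.symbol.infer.shape(encoder, data = c(28, 28, 1, 7))$out.shapes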

Autoencoders (5)

my_optimizer <- mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999, wd = 1e-4)
my.model.FeedForward.create = function (Iterator, ctx = mx.cpu(), save.grad = FALSE,
                                        loss_symbol, pred_symbol,
                                        Optimizer, num_round = 20) {
  
  require(abind)
  
  out_round <- unique(c(1:5, round(quantile(1:num_round, 1:30/30))))
  
  #0. Check data shape
  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)
  
  #1. Build an executor to train model
  exec_list = list(symbol = loss_symbol, ctx = ctx, grad.req = "write")
  exec_list = append(exec_list, input_shape)
  my_executor = do.call(mx.simple.bind, exec_list)
  
  #2. Set the initial parameters
  mx.set.seed(0)
  new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                         input.shape = input_shape,
                                         output.shape = NULL,
                                         initializer = mxnet:::mx.init.uniform(0.01),
                                         ctx = ctx)
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  
  #3. Define the updater
  my_updater = mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  
  #4. Forward/Backward
  message('Start training:')
  
  set.seed(0)
  if (save.grad) {epoch_grad = NULL}
  
  for (i in 1:num_round) {
    
    Iterator$reset()
    batch_loss = list()
    if (save.grad) {batch_grad = list()}
    batch_seq = 0
    t0 = Sys.time()
    
    while (Iterator$iter.next()) {
      
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] = as.array(my_executor$ref.outputs[[1]])
      if (save.grad) {
        grad_list = sapply(my_executor$ref.grad.arrays, function (x) {if (!is.null(x)) {mean(abs(as.array(x)))}})
        grad_list = unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE) & !grepl('out', names(grad_list), fixed = TRUE)])
        batch_grad[[length(batch_grad) + 1]] = grad_list
      }
      batch_seq = batch_seq + 1
      
    }
    
    if (i %in% out_round) {
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                     " (Speed: ", formatC(batch_seq * batch_size/as.numeric(Sys.time() - t0, units = 'secs'), format = "f", digits = 2), " sample/secs)"))
    }
    
    if (save.grad) {epoch_grad = rbind(epoch_grad, apply(abind(batch_grad, along = 2), 1, mean))}
    
  }
  
  if (save.grad) {
    
    epoch_grad[epoch_grad < 1e-8] = 1e-8
    
    COL = rainbow(ncol(epoch_grad))
    random_pos = 2^runif(ncol(epoch_grad), -0.5, 0.5)
    
    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))
    
    for (i in 2:ncol(epoch_grad)) {lines(1:nrow(epoch_grad), epoch_grad[,i] * random_pos[i], col = COL[i])}
    
    legend('topright', paste0('layer', 1:ncol(epoch_grad), '_weight'), col = COL, lwd = 1)
    
  }
  
  #5. Get model
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  
  return(my_model)
  
}
model <- my.model.FeedForward.create(Iterator = my_iter1, ctx = mx.cpu(), save.grad = FALSE,
                                     loss_symbol = mse_loss, pred_symbol = decoder,
                                     Optimizer = my_optimizer, num_round = 20)

Autoencoders (6)

unzip_pred <- predict(model, Test.X)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0

library(imager)

par(mar=rep(0,4), mfcol = c(4, 5))

for (i in 1:10) {
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(Test.X[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
}

Exercise 1: Try separating the compression (encoder) model from the decompression (decoder) model

– Beyond that, you can also try feeding in an arbitrary string of 32 numbers and see what the decompression model decodes it into!

Exercise 1 Answer (1)

– Separating out the compression model is not difficult; it takes an approach similar to the transfer feature learning we did earlier:

all_layers <- model$symbol$get.internals()
encoder_output <- which(all_layers$outputs == 'encoder_output') %>% all_layers$get.output()

encoder_model <- model
encoder_model$symbol <- encoder_output
encoder_model$arg.params <- encoder_model$arg.params[names(encoder_model$arg.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$arg.shapes)]
encoder_model$aux.params <- encoder_model$aux.params[names(encoder_model$aux.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$aux.shapes)]
zip_code <- predict(encoder_model, Test.X)
dim(zip_code)
## [1]    32 16800

Exercise 1 Answer (2)

# Decoder

data <- mx.symbol.Variable('data')

fc3 <- mx.symbol.FullyConnected(data = data, num.hidden = 128, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act_type = "relu", name = 'relu3')

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 784, name = 'fc4')

decoder_output <- mx.symbol.reshape(data = fc4, shape = c(28, 28, 1, -1), name = 'decoder')
decoder_model <- model
decoder_model$symbol <- decoder_output
decoder_model$arg.params <- decoder_model$arg.params[names(decoder_model$arg.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(32, 7))$arg.shapes)]
decoder_model$aux.params <- decoder_model$aux.params[names(decoder_model$aux.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(32, 7))$aux.shapes)]
unzip_pred <- predict(decoder_model, zip_code, array.layout = 'colmajor')
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0

library(imager)

par(mar=rep(0,4), mfcol = c(4, 5))

for (i in 1:20) {
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}

Exercise 1 Extended Discussion (1)

random_zip_code <- array(rnorm(320, sd = 3), dim = c(32, 10))

unzip_pred <- predict(decoder_model, random_zip_code, array.layout = 'colmajor')
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0

library(imager)

par(mar=rep(0,4), mfcol = c(2, 5))

for (i in 1:10) {
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}

Exercise 1 Extended Discussion (2)

test_array <- Test.X

test_array <- test_array + rnorm(prod(dim(test_array)), sd = 0.3)
test_array[test_array > 1] <- 1
test_array[test_array < 0] <- 0

unzip_pred <- predict(model, test_array)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0

library(imager)

par(mar=rep(0,4), mfcol = c(4, 5))

for (i in 1:10) {
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(test_array[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
}
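– A quick quantitative check (our own addition, not in the original slides): the reconstructions should sit closer to the clean images than the noisy inputs do.

mean((test_array - Test.X)^2)  # MSE of the noisy inputs against the clean images
mean((unzip_pred - Test.X)^2)  # MSE of the reconstructions against the clean images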

Deconvolution Layers (1)

– Before today, every convolution layer we used could only shrink the feature maps (down-sampling). For image classification this is not much of a problem, but for other tasks it is rather limiting.

F10_2

Deconvolution Layers (2)

X <- array(1:9, dim = c(3, 3, 1))
Filter <- array(c(-1, 0, 0, 1), dim = c(2, 2, 1, 1))

– This is the case for stride 1 (in general, an n × n input with kernel size k and stride s yields an output of size (n − 1) · s + k; here (3 − 1) · 1 + 2 = 4):

Filter_size <- dim(Filter)[1]
Stride <- 1
out <- array(0, dim = c(4, 4, 1))

for (l in 1:dim(X)[3]) {
  for (k in 1:dim(Filter)[3]) {
    for (j in 1:dim(X)[2]) {
      for (i in 1:dim(X)[1]) {
        row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
        col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
        out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
      }
    }
  }
}

out
## , , 1
## 
##      [,1] [,2] [,3] [,4]
## [1,]   -1   -4   -7    0
## [2,]   -2   -4   -4    7
## [3,]   -3   -4   -4    8
## [4,]    0    3    6    9

– This is the case for stride 2 (output size (3 − 1) · 2 + 2 = 6):

Filter_size <- dim(Filter)[1]
Stride <- 2
out <- array(0, dim = c(6, 6, 1))

for (l in 1:dim(X)[3]) {
  for (k in 1:dim(Filter)[3]) {
    for (j in 1:dim(X)[2]) {
      for (i in 1:dim(X)[1]) {
        row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
        col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
        out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
      }
    }
  }
}

out
## , , 1
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]   -1    0   -4    0   -7    0
## [2,]    0    1    0    4    0    7
## [3,]   -2    0   -5    0   -8    0
## [4,]    0    2    0    5    0    8
## [5,]   -3    0   -6    0   -9    0
## [6,]    0    3    0    6    0    9

Deconvolution Layers (3)

X <- array(1:18, dim = c(3, 3, 2))
Filter <- array(c(-1, 0, 0, 1, 0, 1, -1, 0), dim = c(2, 2, 1, 2))

– This is the case for stride 1 (note that the contributions from both input channels are summed into the single output channel):

Filter_size <- dim(Filter)[1]
Stride <- 1
out <- array(0, dim = c(4, 4, 1))

for (l in 1:dim(X)[3]) {
  for (k in 1:dim(Filter)[3]) {
    for (j in 1:dim(X)[2]) {
      for (i in 1:dim(X)[1]) {
        row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
        col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
        out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
      }
    }
  }
}

out
## , , 1
## 
##      [,1] [,2] [,3] [,4]
## [1,]   -1  -14  -20  -16
## [2,]    8   -2   -2  -10
## [3,]    8   -2   -2  -10
## [4,]   12   18   24    9

– This is the case for stride 2:

Filter_size <- dim(Filter)[1]
Stride <- 2
out <- array(0, dim = c(6, 6, 1))

for (l in 1:dim(X)[3]) {
  for (k in 1:dim(Filter)[3]) {
    for (j in 1:dim(X)[2]) {
      for (i in 1:dim(X)[1]) {
        row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
        col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
        out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
      }
    }
  }
}

out
## , , 1
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]   -1  -10   -4  -13   -7  -16
## [2,]   10    1   13    4   16    7
## [3,]   -2  -11   -5  -14   -8  -17
## [4,]   11    2   14    5   17    8
## [5,]   -3  -12   -6  -15   -9  -18
## [6,]   12    3   15    6   18    9

Building an Autoencoder with Convolutions and Deconvolutions (1)

# Encoder

data <- mx.symbol.Variable('data')

conv1 <- mx.symbol.Convolution(data = data, kernel = c(7, 7), stride = c(7, 7), num_filter = 8, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')

conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')

encoder <- mx.symbol.Convolution(data = relu2, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'encoder')

# Decoder

deconv3 <- mx.symbol.Deconvolution(data = encoder, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv3')
relu3 <- mx.symbol.Activation(data = deconv3, act_type = "relu", name = 'relu3')

deconv4 <- mx.symbol.Deconvolution(data = relu3, kernel = c(2, 2), stride = c(2, 2), num_filter = 8, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')

decoder <- mx.symbol.Deconvolution(data = relu4, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')

# MSE loss

label <- mx.symbol.Variable(name = 'label')

residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder) 
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
model <- my.model.FeedForward.create(Iterator = my_iter1, ctx = mx.cpu(), save.grad = FALSE,
                                     loss_symbol = mse_loss, pred_symbol = decoder,
                                     Optimizer = my_optimizer, num_round = 20)

Building an Autoencoder with Convolutions and Deconvolutions (2)

unzip_pred <- predict(model, Test.X)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0

library(imager)

par(mar=rep(0,4), mfcol = c(4, 5))

for (i in 1:10) {
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(Test.X[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
  rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
  
}

Exercise 2: Learn to perform the deconvolution operation without MxNet

– This is the compression (encoder) model:

all_layers <- model$symbol$get.internals()
encoder_output <- which(all_layers$outputs == 'encoder_output') %>% all_layers$get.output()

encoder_model <- model
encoder_model$symbol <- encoder_output
encoder_model$arg.params <- encoder_model$arg.params[names(encoder_model$arg.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$arg.shapes)]
encoder_model$aux.params <- encoder_model$aux.params[names(encoder_model$aux.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$aux.shapes)]

– This is the decompression (decoder) model:

data <- mx.symbol.Variable('data')

deconv3 <- mx.symbol.Deconvolution(data = data, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv3')
relu3 <- mx.symbol.Activation(data = deconv3, act_type = "relu", name = 'relu3')

deconv4 <- mx.symbol.Deconvolution(data = relu3, kernel = c(2, 2), stride = c(2, 2), num_filter = 8, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')

decoder_output <- mx.symbol.Deconvolution(data = relu4, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')

decoder_model <- model
decoder_model$symbol <- decoder_output
decoder_model$arg.params <- decoder_model$arg.params[names(decoder_model$arg.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(1, 1, 32, 1))$arg.shapes)]
decoder_model$aux.params <- decoder_model$aux.params[names(decoder_model$aux.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(1, 1, 32, 1))$aux.shapes)]
img_input <- Test.X[,,,1]
dim(img_input) <- c(28, 28, 1, 1)

Input <- predict(encoder_model, img_input)
dim(Input)
## [1]  1  1 32  1
Output <- predict(decoder_model, Input)
dim(Output)
## [1] 28 28  1  1

Exercise 2 Answer

# Note: this implementation assumes kernel size == stride (true for every
# Deconvolution in the network above), so the output blocks written by
# neighbouring input pixels never overlap.
DECONV_func <- function (X, WEIGHT, STRIDE) {
  
  original_size <- dim(X)[1]
  out <- array(0, dim = c(original_size * STRIDE, original_size * STRIDE, dim(WEIGHT)[3], dim(X)[4]))
  
  for (m in 1:dim(X)[4]) {
    for (l in 1:dim(X)[3]) {
      for (k in 1:dim(WEIGHT)[3]) {
        for (j in 1:dim(X)[2]) {
          for (i in 1:dim(X)[1]) {
            row_seq <- ((i-1) * STRIDE + 1):((i-1) * STRIDE + STRIDE)
            col_seq <- ((j-1) * STRIDE + 1):((j-1) * STRIDE + STRIDE)
            out[row_seq,col_seq,k,m] <- out[row_seq,col_seq,k,m] + X[i,j,l,m] * WEIGHT[,,k,l]
          }
        }
      }
    }
  }
  
  return(out)
  
}

deconv3_out <- DECONV_func(X = Input, WEIGHT = as.array(decoder_model$arg.params$deconv3_weight), STRIDE = 2)
relu3_out <- deconv3_out
relu3_out[relu3_out < 0] <- 0

deconv4_out <- DECONV_func(X = relu3_out, WEIGHT = as.array(decoder_model$arg.params$deconv4_weight), STRIDE = 2)
relu4_out <- deconv4_out
relu4_out[relu4_out < 0] <- 0

My_Output <- DECONV_func(X = relu4_out, WEIGHT = as.array(decoder_model$arg.params$decoder_weight), STRIDE = 7)
library(imager)

par(mar=rep(0,4), mfcol = c(1, 2))

plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
Output[Output > 1] <- 1
Output[Output < 0] <- 0
rasterImage(t(Output[,,,1]), 0, 0, 1, 1, interpolate = FALSE)

plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
My_Output[My_Output > 1] <- 1
My_Output[My_Output < 0] <- 0
rasterImage(t(My_Output[,,,1]), 0, 0, 1, 1, interpolate = FALSE)
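– Beyond the visual comparison, a numeric check (our own sketch) confirms that MXNet's output and our manual deconvolution agree up to floating-point error:

mean(abs(Output - My_Output))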

Advanced Applications of Autoencoders (1)

– We have also learned that an "Encoder" trained this way really does compress (reduce the dimensionality of) the data, and that the reduced data can be restored to the original images. This tells us that even though we cannot read the Encoder's output directly, it must carry some meaning.

– The figure below examines the relative positions in space of the vectors the "Encoder" produces for different digits; we find that the digits form clusters (to project the data from 32 dimensions down to 2, we use PCA here):

zip_code <- predict(encoder_model, Train.X)
dim(zip_code) <- dim(zip_code)[3:4]
zip_code <- t(zip_code)

PCA_result <- princomp(zip_code, cor = TRUE)

plot(PCA_result$scores[,1], PCA_result$scores[,2],
     xlab = 'Comp.1', ylab = 'Comp.2',
     pch = 19, cex = 0.5, col = rainbow(10)[Train.Y + 1])

legend('topright', legend = 0:9, pch = 19, col = rainbow(10))

Advanced Applications of Autoencoders (2)

sub_Train.DAT <- Train.DAT[1:500,]

fwrite(x = sub_Train.DAT,
       file = 'data/sub_train_data.csv',
       col.names = FALSE, row.names = FALSE)
my_iterator_func2 <- setRefClass("Custom_Iter2",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]
                                    dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                    val.x <- val.x/255
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))  # one-hot encode the digit labels
                                    val.y <- array(val.y, dim = c(10, dim(val.x)[4]))
                                    val.y <- mx.nd.array(val.y)
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter2 = my_iterator_func2(iter = NULL,  data.csv = 'data/sub_train_data.csv', data.shape = 785, batch.size = 20)

Advanced Applications of Autoencoders (3)

data <- mx.symbol.Variable('data')

conv1 <- mx.symbol.Convolution(data = data, kernel = c(7, 7), stride = c(7, 7), num_filter = 8, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')

conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')

conv3 <- mx.symbol.Convolution(data = relu2, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'conv3')

fc1 <- mx.symbol.FullyConnected(data = conv3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')

label <- mx.symbol.Variable(name = 'label')

eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer <- mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999, wd = 1e-4)
my.eval.metric.loss <- mx.metric.custom(
  name = "mlog-loss", 
  function(real, pred) {
    return(pred)
  }
)

mx.set.seed(0)

model.1 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
                                       eval.metric = my.eval.metric.loss,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 100)

Advanced Applications of Autoencoders (4)

model.1$symbol <- softmax

predict_Y <- predict(model.1, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.7761905
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1385    0   26   49   11   48   65   16   25    8
##   2     0 1644   35    8    3    1    7   31   23    1
##   3     4   63 1281  127   21   27   84   39   58    6
##   4    24   15   78 1341    6  109    2   83   78   27
##   5     2    0    9    0 1055    6   53    5    6   99
##   6    85    6   26   87   24 1123   20   14  155   34
##   7    76    1  100   16   31   40 1381    0   18    0
##   8     2   10   12   30   21   15    0 1314    5  180
##   9    31   96   57   50   33  139   44   14 1247   18
##   10   54   16   32   34  401   43    5  237   60 1269

Advanced Applications of Autoencoders (5)

mx.set.seed(0)
new_arg <- mxnet:::mx.model.init.params(symbol = m_logloss,
                                        input.shape = list(data = c(28, 28, 1, 7), label = c(10, 7)),
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.uniform(0.01),
                                        ctx = mx.cpu())

for (k in 1:6) {
  new_arg$arg.params[[k]] <- encoder_model$arg.params[[k]]
}
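– The copy above is purely by position, so it is worth verifying (a check of our own) that the first six parameter shapes line up, even though the autoencoder names the last pair 'encoder_*' while the classifier calls them 'conv3_*':

sapply(new_arg$arg.params[1:6], dim)
sapply(encoder_model$arg.params[1:6], dim)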

model.2 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
                                       eval.metric = my.eval.metric.loss,
                                       arg.params = new_arg$arg.params,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 100)
model.2$symbol <- softmax

predict_Y <- predict(model.2, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.847381
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1485    0   14   12    8   38   44   12   16   10
##   2     0 1761   21    4   14    4    7   22   22   15
##   3     6    2 1375   54   11   13   15   33   24    0
##   4     6   18   51 1456   17   91    3   44   63   42
##   5     3    1   25    0 1311   33   35   31    3  119
##   6    45   12   22   92   11 1202   43    3   94   14
##   7    58    5   22   18   17   34 1450    1    2    1
##   8     1    7   51   19    4   13    5 1496   34   72
##   9    45   45   56   54   54  104   37   12 1364   33
##   10   14    0   19   33  159   19   22   99   53 1336

Exercise 3: Undercomplete vs. complete autoencoders and the difference in predictive performance

– However, ordinary convolutional networks are usually much larger, so the encoder no longer compresses the data at all. Does extending the autoencoder idea to an ordinary convolutional network bring the same advantage?

– Here we again experiment with the small sample, this time training a network with an ordinary (larger) number of convolutional filters:

data <- mx.symbol.Variable('data')

# first conv
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')

# second conv
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')

# third conv
conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'relu3')

# Softmax

fc1 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')

label <- mx.symbol.Variable(name = 'label')

eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

model.3 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
                                       eval.metric = my.eval.metric.loss,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 100)
model.3$symbol <- softmax

predict_Y <- predict(model.3, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9005952
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1511    0    5    6    3   10   11    6    3    8
##   2     0 1792   11    3    4   10    6    6    9    2
##   3     5    7 1443   50    5    0    1   30   12    1
##   4     0    6   27 1569    0   46    0   30   30   22
##   5     1    2   21    3 1322   13   18    4    0   55
##   6    18    6    2   57    5 1406   25   17   56   11
##   7    77    3   25    3   22   24 1587    0    4    0
##   8     2    9   55   12    2    3    0 1496   29   36
##   9    34   20   46   26   25   32   11    2 1506    9
##   10   15    6   21   13  218    7    2  162   26 1498

Exercise 3 Answer (1)

# Encoder

data <- mx.symbol.Variable('data')

conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')

conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')

conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
encoder <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'encoder')

# Decoder

deconv4 <- mx.symbol.Deconvolution(data = encoder, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')

deconv5 <- mx.symbol.Deconvolution(data = relu4, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv5')
relu5 <- mx.symbol.Activation(data = deconv5, act_type = "relu", name = 'relu5')

decoder <- mx.symbol.Deconvolution(data = relu5, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')

# MSE loss

label <- mx.symbol.Variable(name = 'label')

residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder) 
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
model <- my.model.FeedForward.create(Iterator = my_iter1, ctx = mx.cpu(), save.grad = FALSE,
                                     loss_symbol = mse_loss, pred_symbol = decoder,
                                     Optimizer = my_optimizer, num_round = 20)

Exercise 3 Answer (2)

data <- mx.symbol.Variable('data')

# first conv
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')

# second conv
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')

# third conv
conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'relu3')

# Softmax
fc1 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')

label <- mx.symbol.Variable(name = 'label')

eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

mx.set.seed(0)
new_arg <- mxnet:::mx.model.init.params(symbol = m_logloss,
                                        input.shape = list(data = c(28, 28, 1, 7), label = c(10, 7)),
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.uniform(0.01),
                                        ctx = mx.cpu())

for (k in 1:6) {
  new_arg$arg.params[[k]] <- model$arg.params[[k]]
}

model.4 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
                                       eval.metric = my.eval.metric.loss,
                                       arg.params = new_arg$arg.params,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 100)
model.4$symbol <- softmax

predict_Y <- predict(model.4, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9269643
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1565    0    6    7    1   11   11    1    6    8
##   2     0 1804    8    0    8    0    4    5    8    2
##   3     2    5 1513   23    8    1   10   20   17    1
##   4     8   11   31 1622    0   40    0   26   59   22
##   5     2    3   17    0 1432   11   20    9    3   39
##   6     8    2    4   38    4 1437   25    1   36    8
##   7    48    6   17    8   17    7 1560    0    5    0
##   8    12    4   36   14    2    3    0 1622   18   20
##   9    13   15   20   15   17   27   30    3 1486   10
##   10    5    1    4   15  117   14    1   66   37 1532

Stacked Autoencoders (1)

F10_4

Stacked Autoencoders (2)

F10_6

F10_5

– Of course, as the field has evolved we now have many tools for tackling the vanishing-gradient problem, and this whole procedure is very laborious, so hardly anyone trains networks this way anymore. Still, transfer feature learning assisted by an autoencoder remains an important application, one that "has a chance" of improving the final model's accuracy!

Closing Remarks

– There are in fact many more kinds of autoencoder, such as the "denoising autoencoder" (add some noise to the input images while keeping the outputs clean) and the "sparse autoencoder" (constrain the encoder's outputs so that almost all of them are 0; this is easy to implement, simply add a penalty on the encoder's output to the loss function). In your experiments you can try transfer feature learning with these different autoencoders to address the weight-initialization problem; both variants are sketched below.
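– For instance (a sketch of our own, reusing the symbols and iterator defined earlier; the noise level sd = 0.3 and the penalty weight 0.001 are arbitrary choices, not from the original lesson), a denoising autoencoder only needs a different value() method, and a sparse autoencoder only needs one extra term in the loss:

# Denoising sketch: same as Custom_Iter1, but value() corrupts the input
# with Gaussian noise while the label stays clean.
denoise_iterator_func <- setRefClass("Denoise_Iter",
                                     fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                     contains = "Rcpp_MXArrayDataIter",
                                     methods = list(
                                       initialize = function(iter, data.csv, data.shape, batch.size){
                                         csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                         .self$iter <- csv_iter
                                         .self
                                       },
                                       value = function(){
                                         val <- as.array(.self$iter$value()$data)
                                         val.x <- val[-1,]
                                         dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                         val.x <- val.x/255
                                         val.y <- mx.nd.array(val.x)                      # clean target
                                         val.x <- val.x + rnorm(length(val.x), sd = 0.3)  # corrupt the input
                                         val.x[val.x > 1] <- 1
                                         val.x[val.x < 0] <- 0
                                         list(data = mx.nd.array(val.x), label = val.y)
                                       },
                                       iter.next = function(){.self$iter$iter.next()},
                                       reset = function(){.self$iter$reset()},
                                       finalize = function(){}
                                     )
)

# Sparse sketch: add an L1 penalty on the encoder output to the MSE loss.
sparse_penalty <- mx.symbol.mean(mx.symbol.abs(encoder))
sparse_mse_loss <- mx.symbol.MakeLoss(mean_square_residual + 0.001 * sparse_penalty, name = 'sparse_mse')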

– Removing mosaic pixelation is likewise an autoencoding model. Can you now picture how it would work? (A small sketch follows the figure below.)

F10_1
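– As a hint (a sketch of our own, with block = 4 an arbitrary choice): the only new ingredient is a way to pixelate the inputs, while the clean image remains the label. For example, block-averaging 4 × 4 patches:

# Hypothetical helper: pixelate an image by averaging each 4x4 block.
mosaic_fun <- function (img, block = 4) {
  out <- img
  for (i in seq(1, nrow(img), by = block)) {
    for (j in seq(1, ncol(img), by = block)) {
      rs <- i:min(i + block - 1, nrow(img))
      cs <- j:min(j + block - 1, ncol(img))
      out[rs, cs] <- mean(img[rs, cs])  # replace the block with its average
    }
  }
  out
}

library(OpenImageR)
imageShow(t(mosaic_fun(Test.X[,,,1])))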